NOTE: We had some issues merging our two documents, so this document, in addition to our original submission, should be considered the project as a whole. Part 1 (the original submission) includes some EDA, the PCA section, and some modeling. This document contains the K-Means clustering section as well as some additional EDA and modeling.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
library(tidytext)
library(caret)
library(fastDummies)
library(randomForest)
library(broom)
# Data source: https://www.openml.org/d/1590
# Read the CSV once ("?" is treated as NA) and keep the untouched copy around.
raw_income <- read_csv("./openml_1590.csv", na = c("?"))
# Modelling table built from the raw copy instead of re-reading the file:
#   - rows with any missing value are dropped
#   - `class` is replaced by the logical target `income_above_50k`
#   - dummy_cols() expands the categorical columns into 0/1 indicator
#     columns and removes the originals
income <- raw_income %>%
  drop_na() %>%
  mutate(income_above_50k = class == ">50K") %>%
  select(-class) %>%
  dummy_cols(remove_selected_columns = TRUE)
head(income)
## Preprocessing before K-means clustering
# Standardise the six continuous columns (scale() defaults: centre and divide
# by the standard deviation). scale() returns a one-column matrix, so the
# result is flattened with as.numeric() to keep ordinary numeric columns in
# the tibble; matrix columns are awkward for pivot_longer()/ggplot() later on.
income_scaled <- income %>%
  mutate(across(
    c(
      age, fnlwgt, `education-num`,
      `capital-gain`, `capital-loss`, `hours-per-week`
    ),
    ~ as.numeric(scale(.x))
  ))
## Basic K-means clustering
# kmeans() picks its starting centres at random, so the seed is fixed to make
# the cluster assignments (and everything derived from them) repeatable.
# NOTE(review): `income_scaled` still contains `income_above_50k`, so the
# target takes part in forming the clusters -- confirm this is intended before
# `.cluster` is used as a predictor of that same target.
set.seed(504)
# Number of clusters K = 3
kclust <- kmeans(income_scaled, centers = 3)
kclust$centers
## age fnlwgt education-num capital-gain capital-loss hours-per-week
## 1 0.2445702 -0.01456681 0.3582008 -0.1467316 4.4693477 0.2328848
## 2 0.5105878 -0.10131843 0.1756853 0.1340072 -0.2178724 0.3515602
## 3 -0.5502390 0.10583335 -0.2163851 -0.1235900 -0.2167648 -0.3852285
## income_above_50k workclass_Federal-gov workclass_Local-gov workclass_Private
## 1 0.52265141 0.03862661 0.08154506 0.6742966
## 2 0.42718757 0.03806260 0.08192826 0.6461503
## 3 0.03592279 0.02316384 0.05348399 0.8357815
## workclass_Self-emp-inc workclass_Self-emp-not-inc workclass_State-gov
## 1 0.063900811 0.10109680 0.04005722
## 2 0.059401416 0.12890107 0.04496230
## 3 0.009981168 0.03592279 0.04133710
## workclass_Without-pay education_1st-4th education_5th-6th education_7th-8th
## 1 0.0004768717 0.002861230 0.007153076 0.01144492
## 2 0.0005940142 0.003975326 0.006899703 0.02010509
## 3 0.0003295669 0.006073446 0.013323917 0.01690207
## education_9th education_10th education_11th education_12th
## 1 0.006199332 0.01812113 0.02098236 0.004768717
## 2 0.011286269 0.02005940 0.02042495 0.007950651
## 3 0.019585687 0.03512241 0.05310734 0.018502825
## education_Assoc-acdm education_Assoc-voc education_Bachelors
## 1 0.03481164 0.03433476 0.2360515
## 2 0.03262509 0.04742975 0.1996345
## 3 0.03389831 0.03997175 0.1274011
## education_Doctorate education_HS-grad education_Masters education_Preschool
## 1 0.030519790 0.2594182 0.10157368 0.0009537434
## 2 0.020333562 0.3148275 0.08147133 0.0007310944
## 3 0.001647834 0.3459981 0.02438795 0.0025423729
## education_Prof-school education_Some-college marital-status_Divorced
## 1 0.043395327 0.1874106 0.10586552
## 2 0.028695454 0.1835504 0.09262052
## 3 0.003107345 0.2584275 0.19058380
## marital-status_Married-AF-spouse marital-status_Married-civ-spouse
## 1 0.0004768717 0.63900811
## 2 0.0004569340 0.81430203
## 3 0.0009887006 0.08917137
## marital-status_Married-spouse-absent marital-status_Never-married
## 1 0.008106819 0.20505484
## 2 0.007904958 0.04340873
## 3 0.017043315 0.62231638
## marital-status_Separated marital-status_Widowed occupation_Adm-clerical
## 1 0.01859800 0.02288984 0.08631378
## 2 0.01306831 0.02823852 0.06598127
## 3 0.05112994 0.02876648 0.18432203
## occupation_Armed-Forces occupation_Craft-repair occupation_Exec-managerial
## 1 0.0004768717 0.12303290 0.20553171
## 2 0.0002284670 0.17363491 0.17756454
## 3 0.0003766478 0.09237288 0.07848399
## occupation_Farming-fishing occupation_Handlers-cleaners
## 1 0.02193610 0.02527420
## 2 0.04235778 0.02851268
## 3 0.02387006 0.06445386
## occupation_Machine-op-inspct occupation_Other-service
## 1 0.04673343 0.05007153
## 2 0.05926434 0.04482522
## 3 0.07415254 0.17523540
## occupation_Priv-house-serv occupation_Prof-specialty
## 1 0.002384359 0.19980925
## 2 0.001782042 0.16399360
## 3 0.008851224 0.09416196
## occupation_Protective-serv occupation_Sales occupation_Tech-support
## 1 0.02145923 0.1335241 0.03576538
## 2 0.02737034 0.1168380 0.02791867
## 3 0.01563089 0.1210452 0.03455744
## occupation_Transport-moving relationship_Husband relationship_Not-in-family
## 1 0.04768717 0.57319981 0.2288984
## 2 0.06972812 0.76559287 0.1256568
## 3 0.03248588 0.03338041 0.3988701
## relationship_Other-relative relationship_Own-child relationship_Unmarried
## 1 0.01764425 0.07057701 0.04864092
## 2 0.00941284 0.01055517 0.04473384
## 3 0.05207156 0.29411488 0.17452919
## relationship_Wife race_Amer-Indian-Eskimo race_Asian-Pac-Islander race_Black
## 1 0.06103958 0.004768717 0.03051979 0.06390081
## 2 0.04404844 0.008042038 0.02910669 0.05382682
## 3 0.04703390 0.011723164 0.02834275 0.13728814
## race_Other race_White sex_Female sex_Male native-country_Cambodia
## 1 0.005245589 0.8955651 0.2260372 0.7739628 0.0009537434
## 2 0.005026274 0.9039982 0.1168380 0.8831620 0.0006854010
## 3 0.010922787 0.8117232 0.5491525 0.4508475 0.0004237288
## native-country_Canada native-country_China native-country_Columbia
## 1 0.005722461 0.005245589 0.001430615
## 2 0.004386566 0.003061458 0.001507882
## 3 0.002589454 0.001647834 0.002165725
## native-country_Cuba native-country_Dominican-Republic native-country_Ecuador
## 1 0.002384359 0.0009537434 0.0000000000
## 2 0.003381311 0.0014621887 0.0007767878
## 3 0.002542373 0.0029661017 0.0012241055
## native-country_El-Salvador native-country_England native-country_France
## 1 0.0009537434 0.003338102 0.0004768717
## 2 0.0015992689 0.002878684 0.0010052547
## 3 0.0051789077 0.002306968 0.0006120527
## native-country_Germany native-country_Greece native-country_Guatemala
## 1 0.003338102 0.0019074869 0.0004768717
## 2 0.004340873 0.0016906557 0.0007767878
## 3 0.004284369 0.0003766478 0.0032015066
## native-country_Haiti native-country_Holand-Netherlands
## 1 0.001430615 0.0004768717
## 2 0.001096642 0.0000000000
## 3 0.001977401 0.0000000000
## native-country_Honduras native-country_Hong native-country_Hungary
## 1 0.0009537434 0.0019074869 0.0004768717
## 2 0.0001827736 0.0004569340 0.0005483208
## 3 0.0006120527 0.0006591337 0.0002354049
## native-country_India native-country_Iran native-country_Ireland
## 1 0.004291845 0.0019074869 0.0009537434
## 2 0.004249486 0.0016906557 0.0006397076
## 3 0.002118644 0.0007062147 0.0009416196
## native-country_Italy native-country_Jamaica native-country_Japan
## 1 0.001430615 0.0004768717 0.001430615
## 2 0.003152844 0.0015535755 0.002147590
## 3 0.001318267 0.0032015066 0.001836158
## native-country_Laos native-country_Mexico native-country_Nicaragua
## 1 0.0004768717 0.007629948 0.0009537434
## 2 0.0004112406 0.012519991 0.0005940142
## 3 0.0005178908 0.028860640 0.0015536723
## native-country_Outlying-US(Guam-USVI-etc) native-country_Peru
## 1 0.0000000000 0.0004768717
## 2 0.0004112406 0.0006397076
## 3 0.0006120527 0.0014124294
## native-country_Philippines native-country_Poland native-country_Portugal
## 1 0.006199332 0.0009537434 0.000000000
## 2 0.006488462 0.0022846699 0.001644962
## 3 0.006026365 0.0013653484 0.001224105
## native-country_Puerto-Rico native-country_Scotland native-country_South
## 1 0.003814974 0.0000000000 0.002861230
## 2 0.002513137 0.0003655472 0.002330363
## 3 0.005273070 0.0005649718 0.002071563
## native-country_Taiwan native-country_Thailand native-country_Trinadad&Tobago
## 1 0.001430615 0.0004768717 0.0009537434
## 2 0.001325109 0.0007767878 0.0005026274
## 3 0.001082863 0.0005178908 0.0006120527
## native-country_United-States native-country_Vietnam native-country_Yugoslavia
## 1 0.9289461 0.001907487 0.0000000000
## 2 0.9221385 0.001142335 0.0006397076
## 3 0.9022128 0.002542373 0.0004237288
# Model-level summary of the 3-cluster fit
kclust %>% glance()
## Add clusters to the (scaled) dataset
# augment() returns `income_scaled` with the assigned cluster in `.cluster`.
incomek <- kclust %>% augment(income_scaled)
incomek %>% head()
## Visualize clusters
# Draws overlaid density curves (one fill colour per cluster) for the named
# columns of `incomek`, one facet per column.
#   features: character vector of column names in `incomek`
plot_cluster_density <- function(features) {
  long_values <- pivot_longer(incomek, all_of(features), names_to = "feature")
  ggplot(long_values, aes(value, fill = .cluster)) +
    geom_density(alpha = 0.3) +
    facet_wrap(~feature)
}
plot_cluster_density(c("age", "fnlwgt", "education-num"))
plot_cluster_density(c("capital-gain", "capital-loss"))
plot_cluster_density(c("hours-per-week", "workclass_Private"))
plot_cluster_density(c("hours-per-week", "income_above_50k"))
## Try different numbers of clusters
# One k-means fit per k in 1..9, stored as list-columns together with its
# glance() summary and the augment()ed data. The seed is fixed because each
# fit starts from random centres; without it the elbow curve built from these
# fits changes from run to run.
set.seed(504)
kclusts <- tibble(k = 1:9) %>%
  mutate(
    kclust = map(
      k,
      function(n_centers) kmeans(income_scaled, centers = n_centers)
    ),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, income_scaled)
  )
## Plot the different clusters on two axes
# Expand the per-k augmented tables into one long table, then draw
# hours-per-week against education-num coloured by cluster, one panel per k.
assignments <- unnest(kclusts, augmented)
assignments %>%
  ggplot(aes(x = `hours-per-week`, y = `education-num`)) +
  geom_point(mapping = aes(colour = .cluster), alpha = 0.3) +
  facet_wrap(vars(k))
## Look at improvement in within-cluster error
# Can still look for an elbow (looks like about 7).
# `unnest(.drop = TRUE)` was deprecated in tidyr 1.0.0 and no longer removes
# the other list-columns, so keep only `k` and the glance() summaries
# explicitly before unnesting.
clusterings <- kclusts %>%
  select(k, glanced) %>%
  unnest(cols = glanced)
ggplot(clusterings, aes(k, tot.withinss)) +
  geom_line()
# Run k-means with the number of clusters read off the elbow plot
optimal_k <- 7
# Fixed seed: kmeans() starts from random centres, so an unseeded run gives a
# different set of centroids each time the document is knitted.
set.seed(504)
kclust_optimal <- kmeans(income_scaled, centers = optimal_k)
# Check the cluster centroids
kclust_optimal$centers
## age fnlwgt education-num capital-gain capital-loss hours-per-week
## 1 -0.3260702 1.94211126 -0.3182962 -0.09977414 -0.2149966 0.02428684
## 2 0.2457380 -0.01610466 0.3575001 -0.14673158 4.4763810 0.23367710
## 3 0.2777174 -0.29324941 -0.4980678 -0.08213813 -0.2185747 0.46300633
## 4 -1.0386738 -0.18899411 -0.4609996 -0.13148898 -0.2181556 -0.59823451
## 5 -0.2743263 -0.18693331 1.1198578 -0.09321095 -0.2141737 0.07166303
## 6 0.4832941 -0.12090689 1.2888463 0.70732206 -0.2187778 0.47199782
## 7 0.9706938 -0.26987299 -0.5210640 -0.10111811 -0.2138329 -0.43661591
## income_above_50k workclass_Federal-gov workclass_Local-gov workclass_Private
## 1 0.16286939 0.03338597 0.05752312 0.7947214
## 2 0.52465294 0.03877453 0.08137865 0.6744854
## 3 0.31095737 0.02776739 0.05562827 0.7094241
## 4 0.01182998 0.01413317 0.03287270 0.8849456
## 5 0.17673533 0.03652415 0.12636316 0.6794184
## 6 0.74767961 0.04761086 0.09161224 0.5787212
## 7 0.08458510 0.03747995 0.07335569 0.7353070
## workclass_Self-emp-inc workclass_Self-emp-not-inc workclass_State-gov
## 1 0.018723212 0.05819986 0.03722084
## 2 0.064145524 0.10148396 0.03925323
## 3 0.046933433 0.13257292 0.02739342
## 4 0.007747069 0.02931323 0.03036013
## 5 0.022503029 0.05816168 0.07702960
## 6 0.098487453 0.11962874 0.06359574
## 7 0.021875456 0.08691848 0.04389675
## workclass_Without-pay education_1st-4th education_5th-6th education_7th-8th
## 1 0.0002255809 0.011279044 0.0187232123 0.018948793
## 2 0.0004786979 0.002872188 0.0071804691 0.011488751
## 3 0.0002804787 0.006731488 0.0132759910 0.032909499
## 4 0.0006281407 0.003454774 0.0082705193 0.009526801
## 5 0.0000000000 0.000000000 0.0000000000 0.000000000
## 6 0.0003437607 0.000000000 0.0001718804 0.000000000
## 7 0.0011666910 0.008896019 0.0188128919 0.039667493
## education_9th education_10th education_11th education_12th
## 1 0.0218813445 0.0315813219 0.03654410 0.016241823
## 2 0.0062230732 0.0181905218 0.02106271 0.004786979
## 3 0.0209424084 0.0353403141 0.03552730 0.011686612
## 4 0.0174832496 0.0433417085 0.08207705 0.029522613
## 5 0.0000000000 0.0000000000 0.00000000 0.000000000
## 6 0.0001718804 0.0003437607 0.00000000 0.000000000
## 7 0.0253755287 0.0366049293 0.03631326 0.012833601
## education_Assoc-acdm education_Assoc-voc education_Bachelors
## 1 0.03699526 0.04692082 0.077599820
## 2 0.03494495 0.03446625 0.235519387
## 3 0.01318250 0.05908751 0.000000000
## 4 0.01298157 0.02638191 0.006490787
## 5 0.09970573 0.07339450 0.590964168
## 6 0.05551736 0.01667240 0.541938811
## 7 0.01545866 0.03995917 0.015312819
## education_Doctorate education_HS-grad education_Masters education_Preschool
## 1 0.0013534852 0.3841642229 0.0126325288 0.0018046470
## 2 0.0301579703 0.2589755864 0.1019626616 0.0009573959
## 3 0.0000000000 0.5039267016 0.0000000000 0.0015893792
## 4 0.0000000000 0.3987646566 0.0002093802 0.0026172529
## 5 0.0205989268 0.0005193007 0.1660031158 0.0000000000
## 6 0.0608456514 0.0068752149 0.2138191818 0.0000000000
## 7 0.0002916727 0.4808225171 0.0058334549 0.0029167274
## education_Prof-school education_Some-college marital-status_Divorced
## 1 0.0013534852 0.28197609 0.15294383
## 2 0.0435615127 0.18764959 0.10483485
## 3 0.0000000000 0.26580030 0.02729993
## 4 0.0000000000 0.35887772 0.06574539
## 5 0.0289077376 0.01990653 0.22208759
## 6 0.0890340323 0.01460983 0.03162599
## 7 0.0004375091 0.26046376 0.43940499
## marital-status_Married-AF-spouse marital-status_Married-civ-spouse
## 1 0.0011279044 0.41597113
## 2 0.0004786979 0.64145524
## 3 0.0004674645 0.93866866
## 4 0.0011515913 0.04742462
## 5 0.0006924009 0.10628354
## 6 0.0005156411 0.93709178
## 7 0.0004375091 0.19133732
## marital-status_Married-spouse-absent marital-status_Never-married
## 1 0.016467404 0.35325964
## 2 0.008137865 0.20392532
## 3 0.002898280 0.02140987
## 4 0.011201843 0.84170854
## 5 0.018348624 0.59909988
## 6 0.003437607 0.01632864
## 7 0.028875602 0.11389821
## marital-status_Separated marital-status_Widowed occupation_Adm-clerical
## 1 0.050530115 0.009699977 0.12812993
## 2 0.018190522 0.022977501 0.08616563
## 3 0.007198953 0.002056844 0.05067315
## 4 0.031092965 0.001675042 0.17221524
## 5 0.035312446 0.018175524 0.13571058
## 6 0.004297009 0.006703334 0.04297009
## 7 0.079626659 0.146419717 0.22910894
## occupation_Armed-Forces occupation_Craft-repair occupation_Exec-managerial
## 1 0.0002255809 0.18069028 0.09925558
## 2 0.0004786979 0.12350407 0.20584011
## 3 0.0003739716 0.27262528 0.10835826
## 4 0.0005234506 0.10311977 0.04376047
## 5 0.0001731002 0.03375454 0.18590964
## 6 0.0003437607 0.04812650 0.29889997
## 7 0.0000000000 0.08531428 0.10558553
## occupation_Farming-fishing occupation_Handlers-cleaners
## 1 0.036318520 0.064967291
## 2 0.022020105 0.025370991
## 3 0.061237846 0.047307405
## 4 0.030569514 0.092964824
## 5 0.009520512 0.008481911
## 6 0.016844276 0.006703334
## 7 0.025229692 0.032521511
## occupation_Machine-op-inspct occupation_Other-service
## 1 0.076923077 0.10805324
## 2 0.046912398 0.05026328
## 3 0.098915482 0.05244951
## 4 0.078726968 0.21733668
## 5 0.012982517 0.04621776
## 6 0.009797181 0.01151598
## 7 0.085897623 0.18273297
## occupation_Priv-house-serv occupation_Prof-specialty
## 1 0.0049627792 0.06338822
## 2 0.0023934897 0.19913834
## 3 0.0004674645 0.03225505
## 4 0.0072236181 0.03297739
## 5 0.0017310023 0.37995499
## 6 0.0003437607 0.36026126
## 7 0.0173545282 0.05250109
## occupation_Protective-serv occupation_Sales occupation_Tech-support
## 1 0.02571622 0.1071509 0.03789759
## 2 0.02154141 0.1335567 0.03494495
## 3 0.03281601 0.1055535 0.02468212
## 4 0.01612228 0.1439489 0.02523032
## 5 0.01505972 0.1087069 0.05314177
## 6 0.02062564 0.1431763 0.02956342
## 7 0.01531282 0.1004813 0.02843809
## occupation_Transport-moving relationship_Husband relationship_Not-in-family
## 1 0.066320776 0.36160614 0.29934582
## 2 0.047869794 0.57539493 0.22642413
## 3 0.112284966 0.92492521 0.03767764
## 4 0.035280570 0.00889866 0.30067002
## 5 0.008655011 0.01107841 0.59633028
## 6 0.010828463 0.91148161 0.04314197
## 7 0.039521657 0.07525157 0.42744641
## relationship_Other-relative relationship_Own-child relationship_Unmarried
## 1 0.042860365 0.128806677 0.12519738
## 2 0.017711824 0.070368597 0.04882719
## 3 0.006637996 0.009910247 0.01271503
## 4 0.064593802 0.495917085 0.10113065
## 5 0.022676129 0.137441579 0.14211528
## 6 0.002578206 0.003953249 0.01546923
## 7 0.042000875 0.036167420 0.30888144
## relationship_Wife race_Amer-Indian-Eskimo race_Asian-Pac-Islander race_Black
## 1 0.042183623 0.004286037 0.01443718 0.19016467
## 2 0.061273337 0.004786979 0.03015797 0.06414552
## 3 0.008133882 0.011686612 0.01832461 0.04870980
## 4 0.028789782 0.014028476 0.02721943 0.11086683
## 5 0.090358317 0.008481911 0.05123767 0.08343431
## 6 0.023375730 0.002406325 0.04726710 0.03093847
## 7 0.110252297 0.012250255 0.02172962 0.14714890
## race_Other race_White sex_Female sex_Male native-country_Cambodia
## 1 0.009474397 0.7816377 0.26370404 0.7362960 0.0011279044
## 2 0.005265677 0.8956438 0.22355194 0.7764481 0.0009573959
## 3 0.006544503 0.9147345 0.01327599 0.9867240 0.0007479432
## 4 0.013295645 0.8345896 0.46702261 0.5329774 0.0005234506
## 5 0.006577809 0.8502683 0.60083088 0.3991691 0.0001731002
## 6 0.003953249 0.9154349 0.03454795 0.9654520 0.0003437607
## 7 0.006125128 0.8127461 0.69768120 0.3023188 0.0004375091
## native-country_Canada native-country_China native-country_Columbia
## 1 0.001804647 0.0015790661 0.0011279044
## 2 0.005744375 0.0052656774 0.0014360938
## 3 0.002711294 0.0015893792 0.0018698579
## 4 0.001989112 0.0009422111 0.0026172529
## 5 0.004327506 0.0043275056 0.0010386014
## 6 0.006359574 0.0056720523 0.0008594019
## 7 0.004812600 0.0016042001 0.0026250547
## native-country_Cuba native-country_Dominican-Republic native-country_Ecuador
## 1 0.002030228 0.0022558087 0.0000000000
## 2 0.002393490 0.0009573959 0.0000000000
## 3 0.003178758 0.0020568437 0.0010284218
## 4 0.001046901 0.0030360134 0.0017797320
## 5 0.003462005 0.0010386014 0.0003462005
## 6 0.002750086 0.0006875215 0.0008594019
## 7 0.005687618 0.0035000729 0.0011666910
## native-country_El-Salvador native-country_England native-country_France
## 1 0.0072185879 0.002481390 0.0006767426
## 2 0.0009573959 0.003350886 0.0004786979
## 3 0.0023373224 0.001308901 0.0003739716
## 4 0.0059673367 0.001570352 0.0002093802
## 5 0.0003462005 0.004327506 0.0013848018
## 6 0.0010312822 0.004640770 0.0020625645
## 7 0.0033542365 0.002916727 0.0008750182
## native-country_Germany native-country_Greece native-country_Guatemala
## 1 0.004511617 0.0002255809 0.0049627792
## 2 0.003350886 0.0019147918 0.0004786979
## 3 0.003178758 0.0020568437 0.0017763650
## 4 0.003559464 0.0005234506 0.0032453936
## 5 0.006750909 0.0010386014 0.0003462005
## 6 0.005843933 0.0008594019 0.0000000000
## 7 0.003645909 0.0008750182 0.0016042001
## native-country_Haiti native-country_Holand-Netherlands
## 1 0.0018046470 0.0000000000
## 2 0.0014360938 0.0004786979
## 3 0.0013089005 0.0000000000
## 4 0.0014656616 0.0000000000
## 5 0.0008655011 0.0000000000
## 6 0.0006875215 0.0000000000
## 7 0.0030625638 0.0000000000
## native-country_Honduras native-country_Hong native-country_Hungary
## 1 0.0006767426 0.0004511617 0.0004511617
## 2 0.0009573959 0.0019147918 0.0004786979
## 3 0.0000000000 0.0002804787 0.0004674645
## 4 0.0007328308 0.0005234506 0.0000000000
## 5 0.0000000000 0.0008655011 0.0008655011
## 6 0.0001718804 0.0008594019 0.0003437607
## 7 0.0008750182 0.0005833455 0.0004375091
## native-country_India native-country_Iran native-country_Ireland
## 1 0.0015790661 0.0006767426 0.0000000000
## 2 0.0043082815 0.0019147918 0.0009573959
## 3 0.0009349289 0.0004674645 0.0008414361
## 4 0.0017797320 0.0008375209 0.0012562814
## 5 0.0055392072 0.0017310023 0.0010386014
## 6 0.0116878652 0.0037813682 0.0005156411
## 7 0.0005833455 0.0005833455 0.0005833455
## native-country_Italy native-country_Jamaica native-country_Japan
## 1 0.0011279044 0.0022558087 0.0027069704
## 2 0.0014360938 0.0004786979 0.0014360938
## 3 0.0036462229 0.0018698579 0.0008414361
## 4 0.0008375209 0.0032453936 0.0017797320
## 5 0.0012117016 0.0025965034 0.0015579020
## 6 0.0029219663 0.0010312822 0.0051564111
## 7 0.0030625638 0.0029167274 0.0013125273
## native-country_Laos native-country_Mexico native-country_Nicaragua
## 1 0.0004511617 0.067223099 0.0024813896
## 2 0.0004786979 0.007659167 0.0009573959
## 3 0.0005609574 0.021783844 0.0005609574
## 4 0.0005234506 0.023869347 0.0018844221
## 5 0.0005193007 0.004673706 0.0010386014
## 6 0.0001718804 0.003265727 0.0000000000
## 7 0.0004375091 0.011958582 0.0007291819
## native-country_Outlying-US(Guam-USVI-etc) native-country_Peru
## 1 0.0002255809 0.0027069704
## 2 0.0000000000 0.0004786979
## 3 0.0003739716 0.0003739716
## 4 0.0006281407 0.0009422111
## 5 0.0008655011 0.0010386014
## 6 0.0000000000 0.0005156411
## 7 0.0008750182 0.0014583637
## native-country_Philippines native-country_Poland native-country_Portugal
## 1 0.003609294 0.0009023235 0.0011279044
## 2 0.005744375 0.0009573959 0.0000000000
## 3 0.003833209 0.0022438295 0.0028047868
## 4 0.005653266 0.0010469012 0.0011515913
## 5 0.012290116 0.0022503029 0.0005193007
## 6 0.009109660 0.0015469233 0.0001718804
## 7 0.005250109 0.0027708911 0.0017500365
## native-country_Puerto-Rico native-country_Scotland native-country_South
## 1 0.002481390 0.0002255809 0.0009023235
## 2 0.003829584 0.0000000000 0.0028721876
## 3 0.003646223 0.0002804787 0.0018698579
## 4 0.004606365 0.0002093802 0.0018844221
## 5 0.001904102 0.0006924009 0.0036351047
## 6 0.001375043 0.0005156411 0.0030938467
## 7 0.007875164 0.0010208546 0.0020417092
## native-country_Taiwan native-country_Thailand native-country_Trinadad&Tobago
## 1 0.0009023235 0.0002255809 0.0004511617
## 2 0.0014360938 0.0004786979 0.0009573959
## 3 0.0003739716 0.0006544503 0.0006544503
## 4 0.0001046901 0.0005234506 0.0005234506
## 5 0.0041544054 0.0013848018 0.0003462005
## 6 0.0032657271 0.0006875215 0.0003437607
## 7 0.0000000000 0.0004375091 0.0008750182
## native-country_United-States native-country_Vietnam native-country_Yugoslavia
## 1 0.8725468 0.001127904 0.0006767426
## 2 0.9291527 0.001914792 0.0000000000
## 3 0.9233358 0.001308901 0.0004674645
## 4 0.9131072 0.003350084 0.0005234506
## 5 0.9177774 0.001731002 0.0000000000
## 6 0.9145755 0.001375043 0.0008594019
## 7 0.9132274 0.001458364 0.0007291819
# Run k-means with 7 clusters
# Fixed seed so this fit is repeatable (kmeans() starts from random centres).
# NOTE(review): this replaces the earlier 3-cluster `kclust`, but `incomek`
# was built from that 3-cluster fit and is not rebuilt here -- confirm which
# clustering the later modelling step is meant to use.
set.seed(504)
kclust <- kmeans(income_scaled, centers = 7)
# Print the cluster centers
print(kclust$centers)
## age fnlwgt education-num capital-gain capital-loss hours-per-week
## 1 0.2874778 -0.18627993 1.29958173 0.07519984 -0.2109336 0.41456137
## 2 0.8019253 -0.29045945 -0.16325750 -0.09231062 -0.2019152 -0.37120370
## 3 0.5994668 -0.04106179 1.16295118 13.17504707 -0.2187778 0.82779143
## 4 0.5371752 0.03452284 -2.12261249 -0.11319310 -0.1725671 -0.07098081
## 5 -1.0358195 -0.21965650 -0.14741514 -0.13023843 -0.1820534 -0.56375149
## 6 0.2693752 -0.29603331 -0.14471189 -0.07536540 0.6248841 0.47551967
## 7 -0.3425121 1.87062723 -0.08047516 -0.09162742 -0.1630693 0.04456575
## income_above_50k workclass_Federal-gov workclass_Local-gov workclass_Private
## 1 0.57054136 0.047340052 0.12010780 0.5823764
## 2 0.09537167 0.040112202 0.07671809 0.7349229
## 3 1.00000000 0.008733624 0.02620087 0.5240175
## 4 0.06857143 0.006666667 0.04507937 0.8088889
## 5 0.01426454 0.015376065 0.03936643 0.8699518
## 6 0.39912738 0.032912833 0.06184198 0.6877549
## 7 0.17603306 0.037190083 0.06260331 0.7822314
## workclass_Self-emp-inc workclass_Self-emp-not-inc workclass_State-gov
## 1 0.068549332 0.10346848 0.07792360
## 2 0.021178121 0.08078541 0.04544180
## 3 0.248908297 0.17467249 0.01746725
## 4 0.016507937 0.10761905 0.01460317
## 5 0.009633197 0.02908485 0.03603186
## 6 0.056909798 0.12880584 0.03139524
## 7 0.020041322 0.05909091 0.03863636
## workclass_Without-pay education_1st-4th education_5th-6th education_7th-8th
## 1 0.0002343567 0.000000000 0.0000000000 0.0000000000
## 2 0.0008415147 0.000000000 0.0000000000 0.0001402525
## 3 0.0000000000 0.000000000 0.0043668122 0.0000000000
## 4 0.0006349206 0.067936508 0.1346031746 0.2485714286
## 5 0.0005557614 0.000000000 0.0000926269 0.0011115228
## 6 0.0003793987 0.000000000 0.0005690980 0.0011381960
## 7 0.0002066116 0.001652893 0.0035123967 0.0030991736
## education_9th education_10th education_11th education_12th
## 1 0.000000000 0.000000000 0.00000000 0.000000000
## 2 0.001122020 0.010098177 0.02398317 0.012061711
## 3 0.004366812 0.008733624 0.00000000 0.000000000
## 4 0.166031746 0.215555556 0.13619048 0.008253968
## 5 0.007873286 0.029177473 0.06937755 0.025935532
## 6 0.001233046 0.004268235 0.01071801 0.010148914
## 7 0.009504132 0.022727273 0.03243802 0.016115702
## education_Assoc-acdm education_Assoc-voc education_Bachelors
## 1 0.06456527 0.01359269 0.56808062
## 2 0.04011220 0.05946704 0.04908836
## 3 0.00000000 0.01310044 0.24890830
## 4 0.00000000 0.00000000 0.00000000
## 5 0.03436458 0.04362727 0.11532049
## 6 0.00834677 0.06696386 0.04334630
## 7 0.04359504 0.04938017 0.12665289
## education_Doctorate education_HS-grad education_Masters education_Preschool
## 1 0.0526130771 0.0000000 0.22990391 0.00000000
## 2 0.0004207574 0.5086957 0.01472651 0.00000000
## 3 0.1004366812 0.1397380 0.12227074 0.00000000
## 4 0.0000000000 0.0000000 0.00000000 0.02285714
## 5 0.0003705076 0.3408670 0.01018896 0.00000000
## 6 0.0051218818 0.5275538 0.01934933 0.00000000
## 7 0.0022727273 0.3888430 0.02169421 0.00000000
## education_Prof-school education_Some-college marital-status_Divorced
## 1 0.0707757206 0.0004687134 0.10827279
## 2 0.0009817672 0.2791023843 0.48751753
## 3 0.2925764192 0.0655021834 0.08733624
## 4 0.0000000000 0.0000000000 0.10412698
## 5 0.0010188959 0.3206743238 0.04844387
## 6 0.0082519207 0.2929906099 0.03329223
## 7 0.0018595041 0.2766528926 0.13946281
## marital-status_Married-AF-spouse marital-status_Married-civ-spouse
## 1 0.0007030701 0.67190063
## 2 0.0004207574 0.17138850
## 3 0.0043668122 0.79039301
## 4 0.0003174603 0.59650794
## 5 0.0010188959 0.04473879
## 6 0.0004742483 0.91027222
## 7 0.0010330579 0.40475207
## marital-status_Married-spouse-absent marital-status_Never-married
## 1 0.010663229 0.17951723
## 2 0.024123422 0.11514727
## 3 0.004366812 0.08733624
## 4 0.029523810 0.16476190
## 5 0.009447944 0.87087810
## 6 0.002560941 0.04078536
## 7 0.013636364 0.38719008
## marital-status_Separated marital-status_Widowed occupation_Adm-clerical
## 1 0.017225217 0.011717835 0.04909773
## 2 0.075736325 0.125666199 0.25568022
## 3 0.017467249 0.008733624 0.03493450
## 4 0.050476190 0.054285714 0.02857143
## 5 0.023990367 0.001482030 0.18117821
## 6 0.007872522 0.004742483 0.05804799
## 7 0.045247934 0.008677686 0.13057851
## occupation_Armed-Forces occupation_Craft-repair occupation_Exec-managerial
## 1 0.0002343567 0.04054371 0.26658074
## 2 0.0000000000 0.08359046 0.12061711
## 3 0.0000000000 0.06550218 0.27074236
## 4 0.0000000000 0.21650794 0.03015873
## 5 0.0004631345 0.09040385 0.06113375
## 6 0.0003793987 0.24679882 0.14142085
## 7 0.0006198347 0.16590909 0.11177686
## occupation_Farming-fishing occupation_Handlers-cleaners
## 1 0.01453011 0.006444809
## 2 0.01767181 0.026647966
## 3 0.01310044 0.008733624
## 4 0.09587302 0.088253968
## 5 0.02426825 0.079937014
## 6 0.05036517 0.036991369
## 7 0.02727273 0.055371901
## occupation_Machine-op-inspct occupation_Other-service
## 1 0.009842981 0.01699086
## 2 0.067321178 0.15385694
## 3 0.004366812 0.01310044
## 4 0.165396825 0.17333333
## 5 0.064005187 0.19340496
## 6 0.080906763 0.04287205
## 7 0.070247934 0.09855372
## occupation_Priv-house-serv occupation_Prof-specialty
## 1 0.0005858917 0.407077572
## 2 0.0119214586 0.072650771
## 3 0.0000000000 0.427947598
## 4 0.0212698413 0.008571429
## 5 0.0049092256 0.071415339
## 6 0.0005690980 0.066015366
## 7 0.0033057851 0.087603306
## occupation_Protective-serv occupation_Sales occupation_Tech-support
## 1 0.018162644 0.12713850 0.031989688
## 2 0.014305750 0.10729313 0.035483871
## 3 0.008733624 0.14410480 0.004366812
## 4 0.011111111 0.04507937 0.003492063
## 5 0.016858096 0.14672101 0.035383475
## 6 0.034051029 0.11723418 0.029593095
## 7 0.029132231 0.11632231 0.038842975
## occupation_Transport-moving relationship_Husband relationship_Not-in-family
## 1 0.010780408 0.605226154 0.24466839
## 2 0.032959327 0.055820477 0.43281907
## 3 0.004366812 0.733624454 0.15720524
## 4 0.112380952 0.545396825 0.20444444
## 5 0.029918488 0.009818451 0.33892182
## 6 0.094754814 0.890922887 0.05890164
## 7 0.064462810 0.354958678 0.32396694
## relationship_Other-relative relationship_Own-child relationship_Unmarried
## 1 0.009491446 0.02308413 0.05483947
## 2 0.035343619 0.03927069 0.32650771
## 3 0.000000000 0.01746725 0.03056769
## 4 0.058412698 0.04444444 0.11142857
## 5 0.054464617 0.48545758 0.08493887
## 6 0.006260078 0.01327895 0.01536565
## 7 0.036776860 0.12892562 0.11466942
## relationship_Wife race_Amer-Indian-Eskimo race_Asian-Pac-Islander race_Black
## 1 0.06269041 0.004335599 0.04698852 0.04171549
## 2 0.11023843 0.013043478 0.02145863 0.13688640
## 3 0.06113537 0.000000000 0.04803493 0.03930131
## 4 0.03587302 0.009841270 0.02317460 0.10984127
## 5 0.02639867 0.012782512 0.03297518 0.10809559
## 6 0.01527080 0.010623162 0.02115147 0.04429479
## 7 0.04070248 0.004958678 0.01776860 0.18739669
## race_Other race_White sex_Female sex_Male native-country_Cambodia
## 1 0.004687134 0.9022733 0.20412468 0.7958753 0.0003515350
## 2 0.004207574 0.8244039 0.72776999 0.2722300 0.0002805049
## 3 0.013100437 0.8995633 0.14410480 0.8558952 0.0000000000
## 4 0.024444444 0.8326984 0.20539683 0.7946032 0.0009523810
## 5 0.011300482 0.8348462 0.49712857 0.5028714 0.0004631345
## 6 0.004457934 0.9194726 0.03964716 0.9603528 0.0006639476
## 7 0.007024793 0.7828512 0.26838843 0.7316116 0.0012396694
## native-country_Canada native-country_China native-country_Columbia
## 1 0.005741739 0.005507382 0.001171783
## 2 0.004488079 0.001122020 0.002244039
## 3 0.004366812 0.004366812 0.000000000
## 4 0.003174603 0.004126984 0.004761905
## 5 0.002408299 0.001296777 0.001945165
## 6 0.003319738 0.001991843 0.001327895
## 7 0.002066116 0.001859504 0.001239669
## native-country_Cuba native-country_Dominican-Republic native-country_Ecuador
## 1 0.003046637 0.0005858917 0.0008202484
## 2 0.004488079 0.0015427770 0.0007012623
## 3 0.000000000 0.0043668122 0.0000000000
## 4 0.009523810 0.0126984127 0.0019047619
## 5 0.001389403 0.0018525380 0.0015746573
## 6 0.001991843 0.0011381960 0.0007587973
## 7 0.001859504 0.0016528926 0.0000000000
## native-country_El-Salvador native-country_England native-country_France
## 1 0.0009374268 0.005038669 0.0018748535
## 2 0.0012622721 0.002945302 0.0009817672
## 3 0.0000000000 0.000000000 0.0000000000
## 4 0.0196825397 0.001269841 0.0003174603
## 5 0.0027788070 0.001945165 0.0005557614
## 6 0.0014227449 0.001707294 0.0002845490
## 7 0.0047520661 0.002479339 0.0006198347
## native-country_Germany native-country_Greece native-country_Guatemala
## 1 0.005858917 0.0010546051 0.0000000000
## 2 0.004628331 0.0005610098 0.0005610098
## 3 0.000000000 0.0000000000 0.0000000000
## 4 0.002222222 0.0019047619 0.0133333333
## 5 0.003334568 0.0005557614 0.0013894035
## 6 0.003888836 0.0021815423 0.0006639476
## 7 0.005371901 0.0002066116 0.0037190083
## native-country_Haiti native-country_Holand-Netherlands
## 1 0.0007030701 0.000000e+00
## 2 0.0021037868 0.000000e+00
## 3 0.0000000000 0.000000e+00
## 4 0.0050793651 0.000000e+00
## 5 0.0013894035 0.000000e+00
## 6 0.0007587973 9.484966e-05
## 7 0.0018595041 0.000000e+00
## native-country_Honduras native-country_Hong native-country_Hungary
## 1 0.0001171783 0.0007030701 0.0005858917
## 2 0.0007012623 0.0004207574 0.0007012623
## 3 0.0000000000 0.0000000000 0.0000000000
## 4 0.0015873016 0.0012698413 0.0000000000
## 5 0.0003705076 0.0004631345 0.0000926269
## 6 0.0001896993 0.0005690980 0.0004742483
## 7 0.0004132231 0.0008264463 0.0004132231
## native-country_India native-country_Iran native-country_Ireland
## 1 0.0093742676 0.0038668854 0.0007030701
## 2 0.0007012623 0.0005610098 0.0005610098
## 3 0.0174672489 0.0000000000 0.0000000000
## 4 0.0015873016 0.0000000000 0.0006349206
## 5 0.0025935532 0.0007410152 0.0013894035
## 6 0.0015175946 0.0007587973 0.0008536470
## 7 0.0018595041 0.0006198347 0.0000000000
## native-country_Italy native-country_Jamaica native-country_Japan
## 1 0.0024607453 0.001523318 0.0039840637
## 2 0.0021037868 0.002945302 0.0016830295
## 3 0.0000000000 0.000000000 0.0043668122
## 4 0.0088888889 0.002857143 0.0006349206
## 5 0.0008336421 0.002964061 0.0015746573
## 6 0.0018021436 0.001517595 0.0011381960
## 7 0.0016528926 0.002479339 0.0022727273
## native-country_Laos native-country_Mexico native-country_Nicaragua
## 1 0.0002343567 0.002929459 0.0005858917
## 2 0.0002805049 0.004768583 0.0004207574
## 3 0.0000000000 0.008733624 0.0000000000
## 4 0.0015873016 0.147936508 0.0022222222
## 5 0.0003705076 0.010837347 0.0015746573
## 6 0.0004742483 0.006544627 0.0005690980
## 7 0.0006198347 0.039256198 0.0020661157
## native-country_Outlying-US(Guam-USVI-etc) native-country_Peru
## 1 0.0003515350 0.0005858917
## 2 0.0011220196 0.0012622721
## 3 0.0000000000 0.0000000000
## 4 0.0006349206 0.0009523810
## 5 0.0004631345 0.0009262690
## 6 0.0001896993 0.0004742483
## 7 0.0004132231 0.0026859504
## native-country_Philippines native-country_Poland native-country_Portugal
## 1 0.009725803 0.001640497 0.0002343567
## 2 0.005610098 0.002664797 0.0007012623
## 3 0.013100437 0.000000000 0.0000000000
## 4 0.006349206 0.002222222 0.0088888889
## 5 0.006854391 0.001204150 0.0008336421
## 6 0.004078536 0.002086693 0.0013278953
## 7 0.004132231 0.001239669 0.0008264463
## native-country_Puerto-Rico native-country_Scotland native-country_South
## 1 0.001640497 0.0007030701 0.0032809937
## 2 0.004628331 0.0008415147 0.0022440393
## 3 0.000000000 0.0000000000 0.0000000000
## 4 0.014603175 0.0003174603 0.0003174603
## 5 0.003797703 0.0002778807 0.0024082994
## 6 0.002940340 0.0002845490 0.0023712416
## 7 0.002066116 0.0002066116 0.0010330579
## native-country_Taiwan native-country_Thailand native-country_Trinadad&Tobago
## 1 0.0036325287 0.0010546051 0.0003515350
## 2 0.0001402525 0.0007012623 0.0005610098
## 3 0.0043668122 0.0000000000 0.0000000000
## 4 0.0000000000 0.0003174603 0.0015873016
## 5 0.0008336421 0.0006483883 0.0004631345
## 6 0.0006639476 0.0006639476 0.0006639476
## 7 0.0012396694 0.0000000000 0.0004132231
## native-country_United-States native-country_Vietnam native-country_Yugoslavia
## 1 0.9150457 0.0014061401 0.0005858917
## 2 0.9345021 0.0009817672 0.0002805049
## 3 0.9388646 0.0000000000 0.0000000000
## 4 0.7092063 0.0034920635 0.0009523810
## 5 0.9314561 0.0026861801 0.0004631345
## 6 0.9437541 0.0014227449 0.0004742483
## 7 0.9018595 0.0018595041 0.0006198347
## Outliers: local outlier factor (LOF)
library(dbscan)
# One LOF score per row of `income`, with minPts = 10. The function is called
# with its namespace because the result is stored under the same name, `lof`.
lof <- dbscan::lof(income, minPts = 10)
summary(lof)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8445 0.9905 1.0202 1.2133 1.1030 49.1007
# Distribution of the LOF scores, then the scores in increasing order.
hist(lof, breaks = 10, main = "LOF (minPts = 10)")
plot(sort(lof), type = "l", main = "LOF (minPts = 10)",
     xlab = "Points sorted by LOF", ylab = "LOF")
# Rows that get a numeric label in the scatter plots below
lof_flagged <- lof > 1.3
lof_labels <- round(lof, 1)[lof_flagged]
# Capital gain / loss: red circle size grows with the LOF score.
gain_loss <- select(income, c("capital-gain", "capital-loss"))
plot(gain_loss, pch = ".", main = "LOF (minPts = 10)", asp = 1)
points(gain_loss, cex = (lof - 1) * 4, pch = 1, col = "red")
# Labels must be placed with the same two columns that were plotted; given the
# full `income` table, text() takes x/y from its first two columns instead.
text(gain_loss[lof_flagged, ], labels = lof_labels, pos = 3)
# Education / age: same display for a second pair of columns.
edu_age <- select(income, c("education-num", "age"))
plot(edu_age, pch = ".", main = "LOF (minPts = 10)", asp = 1)
points(edu_age, cex = (lof - 1) * 4, pch = 1, col = "red")
text(edu_age[lof_flagged, ], labels = lof_labels, pos = 3)
##Outliers isolation forest
library(isotree)
# Isolation forest fitted on all columns of `income` with ndim = 1 and
# 10 trees, then scored on the same rows.
model <- isolation.forest(income, ndim = 1, ntrees = 10)
scores <- predict(model, income, type = "score")
# Score distribution: histogram and sorted curve.
hist(scores, breaks = 10, main = "IF Scores")
plot(
  sort(scores),
  type = "l",
  main = "IF Scores",
  xlab = "Points sorted by score",
  ylab = "IF score"
)
# Hours worked against education; rows scoring above 0.5 are overdrawn with
# the default plotting symbol, sized by their score.
if_xy <- select(income, c(`hours-per-week`, `education-num`))
if_flagged <- scores > 0.5
plot(if_xy, pch = ".", main = "IF Scores", asp = 1)
points(if_xy[if_flagged, ], cex = scores[if_flagged])
##Model Kmeans Clusters Kappa .57
#income$income_above_50k <- as.factor(income$income_above_50k)
# Recode the logical target of `incomek` as a labelled two-level factor
# (FALSE -> "Below_50K", TRUE -> "Above_50K").
incomek$income_above_50k <- factor(
  incomek$income_above_50k,
  levels = c(FALSE, TRUE),
  labels = c("Below_50K", "Above_50K")
)
# Start the wall-clock timer; it is read again after the model is evaluated.
start_time <- Sys.time()
# 3-fold cross-validation summarised with twoClassSummary (ROC / Sens / Spec),
# which needs class probabilities.
ctrl <- trainControl(
  method = "cv",
  number = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
# Seeded 80/20 split of the rows on the target column.
set.seed(504)
income_index <- createDataPartition(incomek$income_above_50k, p = 0.80, list = FALSE)
train <- incomek[income_index, ]
test <- incomek[-income_index, ]
# Random forest on every remaining column: 20 trees, three candidate mtry
# values, winner picked by cross-validated ROC.
fit <- train(
  income_above_50k ~ .,
  data = train,
  method = "rf",
  ntree = 20,
  tuneLength = 3,
  metric = "ROC",
  trControl = ctrl
)
fit
## Random Forest
##
## 36179 samples
## 105 predictor
## 2 classes: 'Below_50K', 'Above_50K'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 24119, 24120, 24119
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.8643839 0.9677348 0.3701349
## 54 0.8989395 0.9249227 0.6266310
## 106 0.8949022 0.9209539 0.6294190
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 54.
# Predict the held-out rows and cross-tabulate against their true labels.
test_pred <- predict(fit, test)
confusionMatrix(test_pred, factor(test$income_above_50k))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Below_50K Above_50K
## Below_50K 6278 847
## Above_50K 524 1394
##
## Accuracy : 0.8484
## 95% CI : (0.8408, 0.8557)
## No Information Rate : 0.7522
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5727
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9230
## Specificity : 0.6220
## Pos Pred Value : 0.8811
## Neg Pred Value : 0.7268
## Prevalence : 0.7522
## Detection Rate : 0.6942
## Detection Prevalence : 0.7879
## Balanced Accuracy : 0.7725
##
## 'Positive' Class : Below_50K
##
# Stop the clock and report the elapsed time, first as a difftime and then as
# a plain number of minutes.
end_time <- Sys.time()
time_taken <- difftime(end_time, start_time)
print(time_taken)
## Time difference of 41.49416 secs
print(as.numeric(time_taken, units = "mins"))
## [1] 0.6915694
##Model kappa .57 with no feature engineering
#income$income_above_50k <- as.factor(income$income_above_50k)
# Baseline run on the plain dummy-encoded `income` table: the logical target
# becomes a two-level factor labelled "Below_50K" / "Above_50K".
income$income_above_50k <- factor(
  income$income_above_50k,
  levels = c(FALSE, TRUE),
  labels = c("Below_50K", "Above_50K")
)
# Timer covers everything from here until the end-of-run report.
start_time <- Sys.time()
# Resampling setup: 3-fold CV, class probabilities on, ROC-based summary.
ctrl <- trainControl(
  method = "cv",
  number = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
# Fixed seed, then an 80/20 partition on the target.
set.seed(504)
income_index <- createDataPartition(income$income_above_50k, p = 0.80, list = FALSE)
train <- income[income_index, ]
test <- income[-income_index, ]
# Random forest with 20 trees; tuneLength = 3 tries three mtry values and the
# one with the highest cross-validated ROC is kept.
fit <- train(
  income_above_50k ~ .,
  data = train,
  method = "rf",
  ntree = 20,
  tuneLength = 3,
  metric = "ROC",
  trControl = ctrl
)
fit
## Random Forest
##
## 36179 samples
## 104 predictor
## 2 classes: 'Below_50K', 'Above_50K'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 24119, 24120, 24119
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.8618802 0.9841979 0.2572767
## 53 0.8939580 0.9256209 0.6237315
## 104 0.8902885 0.9217991 0.6268540
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 53.
# Class predictions for the test split, compared with the observed labels.
test_pred <- predict(fit, test)
confusionMatrix(test_pred, factor(test$income_above_50k))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Below_50K Above_50K
## Below_50K 6277 849
## Above_50K 525 1392
##
## Accuracy : 0.8481
## 95% CI : (0.8405, 0.8554)
## No Information Rate : 0.7522
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5717
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9228
## Specificity : 0.6212
## Pos Pred Value : 0.8809
## Neg Pred Value : 0.7261
## Prevalence : 0.7522
## Detection Rate : 0.6941
## Detection Prevalence : 0.7880
## Balanced Accuracy : 0.7720
##
## 'Positive' Class : Below_50K
##
# Elapsed time since start_time, printed as a difftime and then in minutes.
end_time <- Sys.time()
time_taken <- difftime(end_time, start_time)
print(time_taken)
## Time difference of 40.84666 secs
print(as.numeric(time_taken, units = "mins"))
## [1] 0.6807777
##EDA
library(ggcorrplot)
library(ggplot2)
library(dplyr)
# Read the census extract once ("?" marks a missing value) and derive the EDA
# table from it instead of parsing the same CSV a second time.
raw_income <- read_csv("./openml_1590.csv", na = c("?"))
# income2: complete rows only, with the `class` label replaced by a numeric
# 0/1 flag (1 = ">50K"). The flag is converted straight from the logical
# comparison; going through a factor first would number the levels by
# whichever values happen to be present. It stays numeric because it is fed
# to cor() and matched with `== 1` further down; plots wrap it in as.factor().
income2 <- raw_income %>%
  drop_na() %>%
  mutate(income_above_50k = as.numeric(class == ">50K")) %>%
  select(-class)
# Plot-size options. NOTE(review): the repr.* options look aimed at a notebook
# front end; confirm they have any effect when this document is knitted.
library(ggplot2)
options(repr.plot.width = 20, repr.plot.height = 12)
# One histogram per numeric column, laid out on a 2 x 3 grid.
hist_cols <- c("age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week")
par(mfrow = c(2, 3))
walk(hist_cols, function(column) {
  hist(income2[[column]], main = column, xlab = column, col = "lightblue")
})
####Correlation Matrix Heatmap
# Numeric columns (plus the 0/1 target) that enter the correlation matrix.
selected_columns <- c("age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week", "income_above_50k")
# Pairwise correlations between those columns.
corr_matrix <- income2 %>%
  select(all_of(selected_columns)) %>%
  cor()
# Heatmap annotated with the coefficients; axis labels black, rotated 45 degrees.
ggcorrplot(
  corr_matrix,
  lab = TRUE,
  lab_size = 4,
  tl.cex = 12,
  tl.col = "black",
  tl.srt = 45
)
##Bucket Age
# Replace `age` with an ordered three-level bucket:
#   [0, 25] -> young, (25, 65] -> prime, (65, 100] -> retired.
# Ages outside [0, 100] fall in no bucket and become NA.
income2 <- income2 %>%
  mutate(
    age_group = cut(
      age,
      breaks = c(0, 25, 65, 100),
      labels = c("young", "prime", "retired"),
      include.lowest = TRUE
    )
  ) %>%
  select(-age)
# Side-by-side counts per age bucket, split by the income flag.
ggplot(income2) +
  geom_bar(
    aes(x = age_group, fill = as.factor(income_above_50k)),
    position = "dodge"
  ) +
  labs(x = "Age Group", y = "Count", fill = "Income Above 50K") +
  theme_minimal()
##Capital Diff
# Net capital movement: gains minus losses, replacing the two source columns.
income2 <- income2 %>%
  mutate(capital_diff = `capital-gain` - `capital-loss`) %>%
  select(-`capital-gain`, -`capital-loss`)
# Bucket the net value: [-5000, 5000] is "Minor", (5000, 100000] is "Major".
# Anything outside those ranges matches neither rule and becomes NA.
income2 <- income2 %>%
  mutate(capital_diff = case_when(
    capital_diff >= -5000 & capital_diff <= 5000 ~ "Minor",
    capital_diff > 5000 & capital_diff <= 100000 ~ "Major"
  ))
# Count plot split by income. income_above_50k is a numeric 0/1 column at this
# point, so it is wrapped in as.factor() (as the other EDA plots do); a numeric
# fill does not split the bars into one per income class.
ggplot(income2, aes(x = capital_diff, fill = as.factor(income_above_50k))) +
  geom_bar(position = "dodge") +
  labs(x = "Capital Diff", y = "Count", fill = "Income") +
  theme_minimal()
##Drop Columns and without pay
# fnlwgt is removed from the EDA table.
income2 <- select(income2, !fnlwgt)
# Employer categories present before filtering.
unique(income2[["workclass"]])
## [1] "Private" "Local-gov" "Self-emp-not-inc" "Federal-gov"
## [5] "State-gov" "Self-emp-inc" "Without-pay"
# Drop the rows whose workclass is "Without-pay".
income2 <- filter(income2, workclass != "Without-pay")
##Bin Hours
# Replace `hours-per-week` with a three-level bucket:
#   [0, 32] -> part_time, (32, 40] -> full_time, (40, 100] -> overtime.
# Values outside [0, 100] match no rule and become NA.
income2 <- income2 %>%
  mutate(hours_per_week = case_when(
    `hours-per-week` >= 0 & `hours-per-week` <= 32 ~ "part_time",
    `hours-per-week` > 32 & `hours-per-week` <= 40 ~ "full_time",
    `hours-per-week` > 40 & `hours-per-week` <= 100 ~ "overtime"
  )) %>%
  select(-`hours-per-week`)
# Fix the level order so the bars read part_time -> full_time -> overtime.
income2$hours_per_week <- factor(income2$hours_per_week, levels = c("part_time", "full_time", "overtime"))
# Count plot split by income. income_above_50k is numeric 0/1 here, so it is
# wrapped in as.factor() (matching the other EDA plots); a numeric fill does
# not split the bars into one per income class.
ggplot(income2, aes(x = hours_per_week, fill = as.factor(income_above_50k))) +
  geom_bar(position = "dodge") +
  labs(x = "Hours per Week", y = "Count", fill = "Income") +
  theme_minimal()
##Looking at Education
# Row counts per education level in the raw (unfiltered) table, split by the
# original `class` label; x labels are tilted 45 degrees.
ggplot(raw_income) +
  geom_bar(aes(x = education, fill = class), position = "dodge") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Education", y = "Count", fill = "Income")
##Education same as education-num
# Every distinct (education, education-num) pair, ordered by the number, is
# printed as one sentence per pair.
edu_lookup <- income2 %>%
  distinct(education, `education-num`) %>%
  arrange(`education-num`)
print(paste0("For ", edu_lookup$education, ", the Education Number is ", edu_lookup$`education-num`))
## [1] "For Preschool, the Education Number is 1"
## [2] "For 1st-4th, the Education Number is 2"
## [3] "For 5th-6th, the Education Number is 3"
## [4] "For 7th-8th, the Education Number is 4"
## [5] "For 9th, the Education Number is 5"
## [6] "For 10th, the Education Number is 6"
## [7] "For 11th, the Education Number is 7"
## [8] "For 12th, the Education Number is 8"
## [9] "For HS-grad, the Education Number is 9"
## [10] "For Some-college, the Education Number is 10"
## [11] "For Assoc-voc, the Education Number is 11"
## [12] "For Assoc-acdm, the Education Number is 12"
## [13] "For Bachelors, the Education Number is 13"
## [14] "For Masters, the Education Number is 14"
## [15] "For Prof-school, the Education Number is 15"
## [16] "For Doctorate, the Education Number is 16"
##Final Education counts
# Education labels from Preschool through 12th grade are merged into a single
# "School" level; education-num is dropped.
school_levels <- c(
  "Preschool", "1st-4th", "5th-6th", "7th-8th",
  "9th", "10th", "11th", "12th"
)
income2 <- income2 %>%
  select(-`education-num`) %>%
  mutate(education = if_else(education %in% school_levels, "School", education))
# Rows per remaining education level, largest first.
education_counts <- count(income2, education, sort = TRUE)
education_counts
##Occupation
# Occupations ranked by how many rows have income_above_50k == 1, most first.
occupation_order <- income2 %>%
  filter(income_above_50k == 1) %>%
  count(occupation, sort = TRUE) %>%
  pull(occupation)
# Factor copy of occupation whose levels follow that ranking.
income2 <- income2 %>%
  mutate(occupation_ordered = factor(occupation, levels = occupation_order))
# Counts per occupation in ranked order, split by the income flag.
ggplot(income2) +
  geom_bar(
    aes(x = occupation_ordered, fill = as.factor(income_above_50k)),
    position = "dodge"
  ) +
  labs(x = "Occupation", y = "Count", fill = "Income Above 50K") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Drop the original occupation column.
# NOTE(review): occupation_ordered is left in the table and so reaches the
# dummy encoding later; confirm whether it was meant to be removed as well.
income2 <- select(income2, -occupation)
##Race
# Counts per race, split by the income flag.
ggplot(income2) +
  geom_bar(
    aes(x = race, fill = as.factor(income_above_50k)),
    position = "dodge"
  ) +
  labs(x = "Race", y = "Count", fill = "Income Above 50K") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
unique(income2$race)
## [1] "Black" "White" "Other"
## [4] "Amer-Indian-Eskimo" "Asian-Pac-Islander"
# Collapse the listed categories into "Other"; any value not listed is kept.
race_to_other <- c("Black", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other")
income2 <- income2 %>%
  mutate(race = if_else(race %in% race_to_other, "Other", race))
##Sex vs Income
# Counts per sex, split by the income flag.
ggplot(income2) +
  geom_bar(
    aes(x = sex, fill = as.factor(income_above_50k)),
    position = "dodge"
  ) +
  labs(x = "Sex", y = "Count", fill = "Income Above 50K") +
  theme_minimal()
##Country
# Rows per country of origin, drawn with vertical axis labels.
country_count <- table(income2$`native-country`)
barplot(country_count, xlab = "Countries", ylab = "Count", main = "Total adults from each Country", las = 2)
# Distinct countries in order of first appearance (kept for later inspection).
countries <- unique(income2$`native-country`)
# Keep the most frequent country and fold every other one into "Other".
# Picking it by frequency, rather than by which country happens to be in the
# first row, makes the result independent of row order.
top_country <- names(country_count)[which.max(country_count)]
income2$`native-country` <- ifelse(
  income2$`native-country` == top_country,
  income2$`native-country`,
  "Other"
)
# Counts for the two remaining country groups, split by the income flag.
ggplot(income2, aes(y = `native-country`, fill = as.factor(income_above_50k))) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(y = "Country", x = "Count", fill = "Income Above 50K")
##Create dummy variables
# One indicator column per level of every column except the target; the source
# columns are removed. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned.
# NOTE(review): the selection includes occupation_ordered if it is still in
# income2 at this point; confirm that is intended.
income2_dummies <- income2 %>%
  dummy_cols(
    select_columns = setdiff(names(income2), "income_above_50k"),
    remove_selected_columns = TRUE
  )
##EDA pt2 (education smaller bins)
# income3: copy of income2 with education reduced to coarser bins. Labels not
# named in either list are kept unchanged.
college_or_less <- c("School", "Some-college", "HS-grad", "Assoc-voc", "Assoc-acdm")
master_plus <- c("Masters", "Prof-school", "Doctorate")
income3 <- income2 %>%
  mutate(education = case_when(
    education %in% college_or_less ~ "Some_college_orless",
    education %in% master_plus ~ "Master_plus",
    TRUE ~ education
  ))
# Rows per education bin, largest first.
education_counts <- count(income3, education, sort = TRUE)
education_counts
##Marital Status
# Marital-status labels present in the data.
unique(income3[["marital-status"]])
## [1] "Never-married" "Married-civ-spouse" "Widowed"
## [4] "Separated" "Divorced" "Married-spouse-absent"
## [7] "Married-AF-spouse"
# Old label -> new bin. The three "Married-*" labels share the "married" bin;
# widowed, separated and divorced share "not_married".
marital_bins <- c(
  "Never-married" = "never",
  "Married-civ-spouse" = "married",
  "Widowed" = "not_married",
  "Separated" = "not_married",
  "Divorced" = "not_married",
  "Married-spouse-absent" = "married",
  "Married-AF-spouse" = "married"
)
# Write the binned value to marital_status and drop the hyphenated source column.
income3 <- income3 %>%
  mutate(marital_status = recode(`marital-status`, !!!marital_bins)) %>%
  select(-`marital-status`)
##Tuesday kappa.57 RF full dataset
#
# income = read_csv("./openml_1590.csv", na=c("?")) %>%
# drop_na() %>%
# mutate(income_above_50k = class==">50K") %>%
# select(-class) %>%
# dummy_cols(remove_selected_columns = T)
#
# #income$income_above_50k <- as.factor(income$income_above_50k)
# income$income_above_50k <- factor(income$income_above_50k, levels = c(FALSE, TRUE), labels = c("Below_50K", "Above_50K"))
#
#
# #timer
# start_time <- Sys.time()
#
#
# # specify the model to be used (i.e. KNN, Naive Bayes, decision tree, random forest, bagged trees) and the tuning parameters used
# ctrl <- trainControl(method = "cv", number = 3, classProbs = TRUE, summaryFunction = twoClassSummary)
# set.seed(504)
#
# income_index <- createDataPartition(income$income_above_50k, p = 0.80, list = FALSE)
# train <- income[ income_index, ]
# test <- income[-income_index, ]
#
# #RF
# fit <- train(income_above_50k ~ .,
# data = train,
# method = "rf",
# ntree = 20,
# tuneLength = 3,
# metric = "ROC",
# trControl = ctrl)
#
# fit
#
# confusionMatrix(predict(fit, test),factor(test$income_above_50k))
#
#
# #end timer
# end_time <- Sys.time()
# time_taken <- end_time - start_time
# print(time_taken)
# print(as.numeric(time_taken, units = "mins"))
#
#
# # Naive Bayes
# fit_nb <- train(income_above_50k ~ .,
# data = train,
# method = "naive_bayes",
# metric = "ROC",
# trControl = ctrl)
#
# fit_nb
#
# confusionMatrix(predict(fit_nb, test), factor(test$income_above_50k))
#
# #error
# # KNN
# #fit_knn <- train(income_above_50k ~ .,
# # data = train,
# # method = "knn",
# # tuneLength = 10,
# # metric = "ROC",
# # trControl = ctrl)
#
# #fit_knn
#
# #confusionMatrix(predict(fit_knn, test), factor(test$income_above_50k))
#
# #error
# # K-Nearest Neighbors (KNN) with data normalization
# #fit_knn_normalized <- train(income_above_50k ~ .,
# # data = train,
# # method = "knn",
# # preProcess = "normalize", # Normalize the data
# # metric = "ROC",
# # trControl = ctrl)
#
# #fit_knn_normalized
#
# #confusionMatrix(predict(fit_knn_normalized, test), factor(test$income_above_50k))
#
# #error
# #library(rpart)
# # Decision Tree (CART)
# #fit_cart <- train(income_above_50k ~ .,
# # data = train,
# # method = "rpart",
# # metric = "ROC",
# # trControl = ctrl)
#
# #fit_cart
#
# #confusionMatrix(predict(fit_cart, test), factor(test$income_above_50k))
#
#
# # SVM Support Vector Machine
# fit_svm <- train(income_above_50k ~ .,
# data = train,
# method = "svmRadial",
# tuneLength = 3,
# metric = "ROC",
# trControl = ctrl)
#
# fit_svm
#
# confusionMatrix(predict(fit_svm, test), factor(test$income_above_50k))
#
# # Logistic Regression Kappa .51
# fit_logreg <- train(income_above_50k ~ .,
# data = train,
# method = "glm",
# family = "binomial",
# metric = "ROC",
# trControl = ctrl)
#
# fit_logreg
#
# confusionMatrix(predict(fit_logreg, test), factor(test$income_above_50k))
#
# # Gradient Boosting Machine Kappa .51
# fit_gbm <- train(income_above_50k ~ .,
# data = train,
# method = "gbm",
# tuneLength = 3,
# metric = "ROC",
# trControl = ctrl)
#
# fit_gbm
#
# confusionMatrix(predict(fit_gbm, test), factor(test$income_above_50k))
#
# # XGBoost Kappa .53
# fit_xgb <- train(income_above_50k ~ .,
# data = train,
# method = "xgbTree",
# tuneLength = 3,
# metric = "ROC",
# trControl = ctrl)
#
# fit_xgb
#
# confusionMatrix(predict(fit_xgb, test), factor(test$income_above_50k))
#
##Multiple Accuracy measurements (Skip)
# ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = defaultSummary)
#
# models <- list(
# rf = train(income_above_50k ~ ., data = train, method = "rf", metric = "Accuracy", trControl = ctrl),
# nb = train(income_above_50k ~ ., data = train, method = "nb", metric = "Accuracy", trControl = ctrl),
# knn = train(income_above_50k ~ ., data = train, method = "knn", metric = "Accuracy", trControl = ctrl),
# dt = train(income_above_50k ~ ., data = train, method = "rpart", metric = "Accuracy", trControl = ctrl)
# )
#
# results <- resamples(models)
# summary(results, metric = "Accuracy")
##Multiple Kappa (Skip)
# ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = defaultSummary)
#
# models <- list(
# rf = train(income_above_50k ~ ., data = train, method = "rf", metric = "Kappa", trControl = ctrl),
# nb = train(income_above_50k ~ ., data = train, method = "nb", metric = "Kappa", trControl = ctrl),
# knn = train(income_above_50k ~ ., data = train, method = "knn", metric = "Kappa", trControl = ctrl),
# dt = train(income_above_50k ~ ., data = train, method = "rpart", metric = "Kappa", trControl = ctrl)
# )
#
# results <- resamples(models)
# summary(results, metric = "Kappa")
##Multiple AUC (Skip)
# ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)
#
# # Train multiple models using different methods
# models <- list(
# rf = train(income_above_50k ~ ., data = train, method = "rf", metric = "ROC", trControl = ctrl),
# nb = train(income_above_50k ~ ., data = train, method = "nb", metric = "ROC", trControl = ctrl))#,
# #knn = train(income_above_50k ~ ., data = train, method = "knn", metric = "ROC", trControl = ctrl),
# #dt = train(income_above_50k ~ ., data = train, method = "rpart", metric = "ROC", trControl = ctrl)
# )
#
# # Evaluate the models using the chosen metrics
# results <- resamples(models)
#
# # Compare the models
# summary(results, metric = "ROC")
#
# #error
# prob <- predict(fit, newdata=test)
# pred <- ifelse(prob > 0.5, 1, 0)
# confusionMatrix(factor(pred),factor(test$income_above_50k))
# #error
# library(pROC)
# myRoc <- roc(test$income_above_50k, prob)
# plot(myRoc)
##Multiple F1 (Skip)
#
# ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = prSummary)
#
# models <- list(
# rf = train(income_above_50k ~ ., data = train, method = "rf", metric = "F1", trControl = ctrl),
# nb = train(income_above_50k ~ ., data = train, method = "nb", metric = "F1", trControl = ctrl),
# knn = train(income_above_50k ~ ., data = train, method = "knn", metric = "F1", trControl = ctrl),
# dt = train(income_above_50k ~ ., data = train, method = "rpart", metric = "F1", trControl = ctrl)
# )
#
# results <- resamples(models)
# summary(results, metric = "F1")
##Tuesday pt2 EDA v1 variables kappa .41 RF
#
# income = read_csv("./openml_1590.csv", na=c("?")) %>%
# drop_na() %>%
# mutate(income_above_50k = class==">50K") %>%
# select(-class) #%>%
# #dummy_cols(remove_selected_columns = T)
#
# #income$income_above_50k <- as.factor(income$income_above_50k)
# income$income_above_50k <- factor(income$income_above_50k, levels = c(FALSE, TRUE), labels = c("Below_50K", "Above_50K"))
#
# #New adds
#
# # Bin the 'age' column
# income <- income %>%
# mutate(age_group = case_when(
# age >= 0 & age <= 25 ~ "young",
# age > 25 & age <= 65 ~ "prime",
# age > 65 & age <= 100 ~ "retired"
# )) %>%
# select(-age)
#
# # Create 'Capital Diff' column and remove 'Capital Gain' and 'Capital Loss' columns
# income <- income %>%
# mutate(capital_diff = `capital-gain` - `capital-loss`) %>%
# select(-`capital-gain`, -`capital-loss`)
#
# # Bin the 'Capital Diff' column
# income <- income %>%
# mutate(capital_diff = case_when(
# capital_diff >= -5000 & capital_diff <= 5000 ~ "Minor",
# capital_diff > 5000 & capital_diff <= 100000 ~ "Major"
# ))
#
# #drop Columns
# income <- income %>% select(-fnlwgt)
#
# income <- income %>%
# filter(workclass != "Without-pay")
#
# # Bin 'Hours per Week' column
# income <- income %>%
# mutate(hours_per_week = case_when(
# `hours-per-week` >= 0 & `hours-per-week` <= 32 ~ "part_time",
# `hours-per-week` > 32 & `hours-per-week` <= 40 ~ "full_time",
# `hours-per-week` > 40 & `hours-per-week` <= 100 ~ "overtime"
# )) %>%
# select(-`hours-per-week`)
#
# #Bin Education
# income <- income %>%
# select(-`education-num`) %>%
# mutate(education = recode(education,
# "11th" = "School",
# "9th" = "School",
# "7th-8th" = "School",
# "5th-6th" = "School",
# "10th" = "School",
# "1st-4th" = "School",
# "Preschool" = "School",
# "12th" = "School"
# ))
#
# #Race
# income <- income %>%
# mutate(race = recode(race,
# "Black" = "Other",
# "Asian-Pac-Islander" = "Other",
# "Amer-Indian-Eskimo" = "Other",
# "Other" = "Other"
# ))
#
# # Replace all countries except the first one with 'Other'
# countries <- unique(income$`native-country`)
# income$`native-country` <- ifelse(income$`native-country` %in% countries[-1], "Other", income$`native-country`)
#
#
# #Create Dummy Variables
# income <- income %>%
# dummy_cols(select_columns = setdiff(names(income), "income_above_50k"), remove_selected_columns = T)
#
# #income %>%
# #dummy_cols(remove_selected_columns = T)
#
# #timer
# start_time <- Sys.time()
#
#
# # specify the model to be used (i.e. KNN, Naive Bayes, decision tree, random forest, bagged trees) and the tuning parameters used
# ctrl <- trainControl(method = "cv", number = 3, classProbs = TRUE, summaryFunction = twoClassSummary)
# set.seed(504)
#
# income_index <- createDataPartition(income$income_above_50k, p = 0.80, list = FALSE)
# train <- income[ income_index, ]
# test <- income[-income_index, ]
#
# # example spec for rf
# fit <- train(income_above_50k ~ .,
# data = train,
# method = "rf",
# ntree = 20,
# tuneLength = 3,
# metric = "ROC",
# trControl = ctrl)
#
# fit
#
# confusionMatrix(predict(fit, test),factor(test$income_above_50k))
#
#
# #end timer
# end_time <- Sys.time()
# time_taken <- end_time - start_time
# print(time_taken)
# print(as.numeric(time_taken, units = "mins"))
##Tuesday pt3 w/EDA pt2 variables RF Kappa .46
#
# income = read_csv("./openml_1590.csv", na=c("?")) %>%
# drop_na() %>%
# mutate(income_above_50k = class==">50K") %>%
# select(-class) #%>%
# #dummy_cols(remove_selected_columns = T)
#
# #income$income_above_50k <- as.factor(income$income_above_50k)
# income$income_above_50k <- factor(income$income_above_50k, levels = c(FALSE, TRUE), labels = c("Below_50K", "Above_50K"))
#
# #New adds
#
# # Bin the 'age' column
# income <- income %>%
# mutate(age_group = case_when(
# age >= 0 & age <= 25 ~ "young",
# age > 25 & age <= 65 ~ "prime",
# age > 65 & age <= 100 ~ "retired"
# )) %>%
# select(-age)
#
# # Create 'Capital Diff' column and remove 'Capital Gain' and 'Capital Loss' columns
# income <- income %>%
# mutate(capital_diff = `capital-gain` - `capital-loss`) %>%
# select(-`capital-gain`, -`capital-loss`)
#
# # Bin the 'Capital Diff' column
# income <- income %>%
# mutate(capital_diff = case_when(
# capital_diff >= -5000 & capital_diff <= 5000 ~ "Minor",
# capital_diff > 5000 & capital_diff <= 100000 ~ "Major"
# ))
#
# #drop Columns
# income <- income %>% select(-fnlwgt)
#
# income <- income %>%
# filter(workclass != "Without-pay")
#
# # Bin 'Hours per Week' column
# income <- income %>%
# mutate(hours_per_week = case_when(
# `hours-per-week` >= 0 & `hours-per-week` <= 32 ~ "part_time",
# `hours-per-week` > 32 & `hours-per-week` <= 40 ~ "full_time",
# `hours-per-week` > 40 & `hours-per-week` <= 100 ~ "overtime"
# )) %>%
# select(-`hours-per-week`)
#
# #Bin Education
# income <- income %>%
# select(-`education-num`) %>%
# mutate(education = recode(education,
# "11th" = "School",
# "9th" = "School",
# "7th-8th" = "School",
# "5th-6th" = "School",
# "10th" = "School",
# "1st-4th" = "School",
# "Preschool" = "School",
# "12th" = "School"
# ))
#
# #Race
# income <- income %>%
# mutate(race = recode(race,
# "Black" = "Other",
# "Asian-Pac-Islander" = "Other",
# "Amer-Indian-Eskimo" = "Other",
# "Other" = "Other"
# ))
#
# # Replace all countries except the first one with 'Other'
# countries <- unique(income$`native-country`)
# income$`native-country` <- ifelse(income$`native-country` %in% countries[-1], "Other", income$`native-country`)
#
#
# #Education further binned
# income <- income %>%
# mutate(education = recode(education,
# "School" = "Some_college_orless",
# "Some-college" = "Some_college_orless",
# "HS-grad" = "Some_college_orless",
# "Assoc-voc" = "Some_college_orless",
# "Assoc-acdm" = "Some_college_orless",
# "Masters" = "Master_plus",
# "Prof-school" = "Master_plus",
# "Doctorate" = "Master_plus"
# ))
#
# #occupation further binned
# income <- income %>%
# mutate(occupation = recode(occupation,
# "Exec-managerial" = "high_tier",
# "Prof-specialty" = "high_tier",
# "Sales" = "mid_tier",
# "Craft-repair" = "mid_tier",
# "Adm-clerical" = "mid_tier",
# "Transport-moving" = "low_tier",
# "Tech-support" = "low_tier",
# "Machine-op-inspct" = "low_tier",
# "Protective-serv" = "low_tier",
# "Other-service" = "low_tier",
# "Farming-fishing" = "low_tier",
# "Handlers-cleaners" = "low_tier",
# "Armed-Forces" = "low_tier",
# "Priv-house-serv" = "low_tier"
# ))
#
# #further binned marital status
# income <- income %>%
# mutate(`marital_status` = recode(`marital-status`,
# "Never-married" = "never",
# "Married-civ-spouse" = "married",
# "Widowed" = "not_married",
# "Separated" = "not_married",
# "Divorced" = "not_married",
# "Married-spouse-absent" = "married",
# "Married-AF-spouse" = "married"
# )) %>%
# select(-`marital-status`)
#
#
#
#
# #Create Dummy Variables
# income <- income %>%
# dummy_cols(select_columns = setdiff(names(income), "income_above_50k"), remove_selected_columns = T)
#
# #income %>%
# #dummy_cols(remove_selected_columns = T)
#
# #timer
# start_time <- Sys.time()
#
#
# # specify the model to be used (i.e. KNN, Naive Bayes, decision tree, random forest, bagged trees) and the tuning parameters used
# ctrl <- trainControl(method = "cv", number = 3, classProbs = TRUE, summaryFunction = twoClassSummary)
# set.seed(504)
#
# income_index <- createDataPartition(income$income_above_50k, p = 0.80, list = FALSE)
# train <- income[ income_index, ]
# test <- income[-income_index, ]
#
# # example spec for rf
# fit <- train(income_above_50k ~ .,
# data = train,
# method = "rf",
# ntree = 20,
# tuneLength = 3,
# metric = "ROC",
# trControl = ctrl)
#
# fit
#
# confusionMatrix(predict(fit, test),factor(test$income_above_50k))
#
#
# #end timer
# end_time <- Sys.time()
# time_taken <- end_time - start_time
# print(time_taken)
# print(as.numeric(time_taken, units = "mins"))